knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.11 ✔ rsample 1.3.1
## ✔ dials 1.4.2 ✔ tailor 0.1.0
## ✔ infer 1.1.0 ✔ tune 2.0.1
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.4.1 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(reportRmd)
## systemfonts and textshaping have been compiled with different versions of Freetype. Because of this, textshaping will not use the font cache provided by systemfonts
library(sjPlot)
##
## Attaching package: 'sjPlot'
##
## The following object is masked from 'package:ggplot2':
##
## set_theme
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:scales':
##
## alpha, rescale
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(parallel)
library(finalfit)
##
## Attaching package: 'finalfit'
##
## The following object is masked from 'package:reportRmd':
##
## extract_labels
library(gtsummary)
library(mlbench)
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
library(rsample)
library(tune)
library(recipes)
library(yardstick)
library(parsnip)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-10
library(themis)
library(corrr)
library(performance)
##
## Attaching package: 'performance'
##
## The following objects are masked from 'package:yardstick':
##
## mae, rmse
library(utils)
library(see)
data_1 <- read_csv("canpath_imputed.csv")
## Rows: 41187 Columns: 93
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ID
## dbl (92): ADM_STUDY_ID, SDC_GENDER, SDC_AGE_CALC, SDC_MARITAL_STATUS, SDC_ED...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(data_1)
## [1] "ID" "ADM_STUDY_ID"
## [3] "SDC_GENDER" "SDC_AGE_CALC"
## [5] "SDC_MARITAL_STATUS" "SDC_EDU_LEVEL"
## [7] "SDC_EDU_LEVEL_AGE" "SDC_INCOME"
## [9] "SDC_INCOME_IND_NB" "SDC_HOUSEHOLD_ADULTS_NB"
## [11] "SDC_HOUSEHOLD_CHILDREN_NB" "HS_GEN_HEALTH"
## [13] "PA_TOTAL_SHORT" "PM_BMI_SR"
## [15] "HS_ROUTINE_VISIT_EVER" "HS_DENTAL_VISIT_EVER"
## [17] "HS_FOBT_EVER" "HS_COL_EVER"
## [19] "HS_SIG_EVER" "HS_SIG_COL_EVER"
## [21] "HS_POLYP_EVER" "HS_PSA_EVER"
## [23] "WH_CONTRACEPTIVES_EVER" "WH_HFT_EVER"
## [25] "WH_MENOPAUSE_EVER" "WH_HRT_EVER"
## [27] "WH_HYSTERECTOMY_EVER" "WH_OOPHORECTOMY_EVER"
## [29] "HS_MMG_EVER" "HS_PAP_EVER"
## [31] "DIS_HBP_EVER" "DIS_MI_EVER"
## [33] "DIS_STROKE_EVER" "DIS_ASTHMA_EVER"
## [35] "DIS_COPD_EVER" "DIS_DEP_EVER"
## [37] "DIS_DIAB_EVER" "DIS_LC_EVER"
## [39] "DIS_CH_EVER" "DIS_CROHN_EVER"
## [41] "DIS_UC_EVER" "DIS_IBS_EVER"
## [43] "DIS_ECZEMA_EVER" "DIS_SLE_EVER"
## [45] "DIS_PS_EVER" "DIS_MS_EVER"
## [47] "DIS_OP_EVER" "DIS_ARTHRITIS_EVER"
## [49] "DIS_CANCER_EVER" "DIS_HBP_FAM_EVER"
## [51] "DIS_MI_FAM_EVER" "DIS_STROKE_FAM_EVER"
## [53] "DIS_ASTHMA_FAM_EVER" "DIS_COPD_FAM_EVER"
## [55] "DIS_DEP_FAM_EVER" "DIS_DIAB_FAM_EVER"
## [57] "DIS_LC_FAM_EVER" "DIS_CH_FAM_EVER"
## [59] "DIS_CROHN_FAM_EVER" "DIS_UC_FAM_EVER"
## [61] "DIS_IBS_FAM_EVER" "DIS_ECZEMA_FAM_EVER"
## [63] "DIS_SLE_FAM_EVER" "DIS_PS_FAM_EVER"
## [65] "DIS_MS_FAM_EVER" "DIS_OP_FAM_EVER"
## [67] "DIS_ARTHRITIS_FAM_EVER" "DIS_CANCER_FAM_EVER"
## [69] "DIS_CANCER_F_EVER" "DIS_CANCER_M_EVER"
## [71] "DIS_CANCER_SIB_EVER" "DIS_CANCER_CHILD_EVER"
## [73] "ALC_EVER" "SMK_CIG_EVER"
## [75] "SMK_CIG_WHOLE_EVER" "DIS_ENDO_HB_CHOL_EVER"
## [77] "DIS_CARDIO_HD_EVER" "DIS_RESP_SLEEP_APNEA_EVER"
## [79] "DIS_MH_ANXIETY_EVER" "DIS_MH_ADDICTION_EVER"
## [81] "DIS_NEURO_MIGRAINE_EVER" "PSE_ADULT_WRK_DURATION"
## [83] "PSE_WRK_FREQ" "WRK_FULL_TIME"
## [85] "WRK_PART_TIME" "WRK_RETIREMENT"
## [87] "WRK_HOME_FAMILY" "WRK_UNABLE"
## [89] "WRK_UNEMPLOYED" "WRK_UNPAID"
## [91] "WRK_STUDENT" "WRK_EMPLOYMENT"
## [93] "WRK_SCHEDULE_CUR_CAT"
summary(data_1$SDC_EDU_LEVEL)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 4.000 4.522 6.000 7.000
# check missing in education level
sum(is.na(data_1$SDC_EDU_LEVEL))
## [1] 0
data <- data_1 |> select(ID,
SDC_AGE_CALC,
PA_TOTAL_SHORT,
PM_BMI_SR,
SDC_EDU_LEVEL_AGE,
PSE_ADULT_WRK_DURATION,
SDC_INCOME,
SDC_GENDER)
data$SDC_INCOME <- as.factor(data_1$SDC_INCOME)
data$SDC_GENDER <- as.factor(data_1$SDC_GENDER)
rm_covsum(data=data,
covs=c('SDC_AGE_CALC','PA_TOTAL_SHORT', 'PM_BMI_SR', 'SDC_EDU_LEVEL_AGE', 'PSE_ADULT_WRK_DURATION'))
| n=41187 | |
|---|---|
| SDC AGE CALC | |
| Mean (sd) | 51.5 (10.8) |
| Median (Min,Max) | 52 (30, 74) |
| PA TOTAL SHORT | |
| Mean (sd) | 2606.5 (2691.8) |
| Median (Min,Max) | 1800 (0, 19278) |
| PM BMI SR | |
| Mean (sd) | 27.7 (6.2) |
| Median (Min,Max) | 26.6 (8.9, 69.4) |
| SDC EDU LEVEL AGE | |
| Mean (sd) | 25.3 (9.2) |
| Median (Min,Max) | 23 (-7, 73) |
| PSE ADULT WRK DURATION | |
| Mean (sd) | 6.7 (9.5) |
| Median (Min,Max) | 2 (0, 51) |
## Detailed summary statistics???
rm_covsum(data=data,
covs=c('SDC_AGE_CALC','PA_TOTAL_SHORT', 'PM_BMI_SR', 'SDC_EDU_LEVEL_AGE', 'PSE_ADULT_WRK_DURATION'))
| n=41187 | |
|---|---|
| SDC AGE CALC | |
| Mean (sd) | 51.5 (10.8) |
| Median (Min,Max) | 52 (30, 74) |
| PA TOTAL SHORT | |
| Mean (sd) | 2606.5 (2691.8) |
| Median (Min,Max) | 1800 (0, 19278) |
| PM BMI SR | |
| Mean (sd) | 27.7 (6.2) |
| Median (Min,Max) | 26.6 (8.9, 69.4) |
| SDC EDU LEVEL AGE | |
| Mean (sd) | 25.3 (9.2) |
| Median (Min,Max) | 23 (-7, 73) |
| PSE ADULT WRK DURATION | |
| Mean (sd) | 6.7 (9.5) |
| Median (Min,Max) | 2 (0, 51) |
data %>%
correlate() %>%
rearrange() %>%
shave() %>%
rplot(print_cor=TRUE)
## Non-numeric variables removed from input: `ID`, `SDC_INCOME`, and `SDC_GENDER`
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the corrr package.
## Please report the issue at <https://github.com/tidymodels/corrr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
pca_recipe <- recipe(~., data = data) %>%
update_role(ID, SDC_INCOME, SDC_GENDER, new_role = "id") %>%
step_scale(all_predictors()) %>%
step_center(all_predictors()) %>%
step_pca(all_predictors(), id = "pca_id")
pca_recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## predictor: 5
## id: 3
##
## ── Operations
## • Scaling for: all_predictors()
## • Centering for: all_predictors()
## • PCA extraction with: all_predictors()
pca_prepared <- prep(pca_recipe)
pca_prepared
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## predictor: 5
## id: 3
##
## ── Training information
## Training data contained 41187 data points and no incomplete rows.
##
## ── Operations
## • Scaling for: SDC_AGE_CALC, PA_TOTAL_SHORT, PM_BMI_SR, ... | Trained
## • Centering for: SDC_AGE_CALC, PA_TOTAL_SHORT, PM_BMI_SR, ... | Trained
## • PCA extraction with: SDC_AGE_CALC PA_TOTAL_SHORT, ... | Trained
pca_baked <- bake(pca_prepared, data)
pca_baked
## # A tibble: 41,187 × 8
## ID SDC_INCOME SDC_GENDER PC1 PC2 PC3 PC4 PC5
## <chr> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SYN_58621 6 2 0.226 -0.183 0.873 -0.161 -0.770
## 2 SYN_58622 6 2 -0.372 0.719 -0.184 -0.565 0.912
## 3 SYN_58623 4 2 0.792 -1.37 1.35 0.840 0.974
## 4 SYN_58624 3 2 -0.526 0.0250 0.418 -1.16 1.08
## 5 SYN_58625 4 2 1.45 1.80 -1.75 1.69 0.578
## 6 SYN_58626 4 2 -0.987 -1.62 1.14 1.10 -0.154
## 7 SYN_58627 5 2 -1.65 -0.294 0.240 -0.133 -0.466
## 8 SYN_58628 3 2 0.252 1.04 -0.707 -0.0320 1.20
## 9 SYN_58629 3 2 0.771 -0.578 -0.915 2.20 0.645
## 10 SYN_586210 5 2 0.241 -0.587 -1.47 0.00528 0.747
## # ℹ 41,177 more rows
pca_variables <- tidy(pca_prepared, id = "pca_id", type = "coef")
ggplot(pca_variables) +
geom_point(aes(x = value, y = terms, color = component))+
labs(color = NULL) +
geom_vline(xintercept=0) +
geom_vline(xintercept=-0.2, linetype = 'dashed') +
geom_vline(xintercept=0.2, linetype = 'dashed') +
facet_wrap(~ component) +
theme_minimal()
pca_variances <- tidy(pca_prepared, id = "pca_id", type = "variance")
pca_variance <- pca_variances |> filter(terms == "percent variance")
pca_variance$component <- as.factor(pca_variance$component)
pca_variance$comp <- as.numeric(pca_variance$component)
ggplot(pca_variance, aes(x = component, y = value, group = 1, color = component)) +
geom_point() +
geom_line() +
labs(x = "Principal Components", y = "Variance explained (%)") +
theme_minimal()
pca_cummul_variance <- pca_variances |> filter(terms == "cumulative percent variance")
pca_cummul_variance$component <- as.factor(pca_cummul_variance$component)
pca_cummul_variance$comp <- as.numeric(pca_cummul_variance$component)
ggplot(pca_cummul_variance, aes(x = component, y = value, group = 1, color = component)) +
geom_point() +
geom_line() +
labs(x = "Principal Components", y = "Cummulative Variance explained (%)") +
theme_minimal()
pca_corr <- pca_baked |> select(!(ID))
pca_corr %>%
correlate() %>%
rearrange() %>%
shave() %>%
rplot(print_cor=TRUE)
## Non-numeric variables removed from input: `SDC_INCOME`, and `SDC_GENDER`
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
ggplot(pca_baked, aes(x = PC1, y = PC2)) +
geom_point(alpha = 0.1) +
geom_smooth(method = "lm") +
labs(x = "PC1", y = "PC2") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
scatter_3d <- plot_ly(pca_baked, x = ~PC3, y = ~PC4, z = ~PC5, type = "scatter3d", mode = "markers",
marker = list(size = 3)) %>%
layout(title = "3D Scatter Plot",
scene = list(xaxis = list(title = "PC3"),
yaxis = list(title = "PC4"),
zaxis = list(title = "PC5")))
# Display the 3D scatter plot
scatter_3d